#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May  3 09:16:28 2022

@author: cythnia
"""

#————————————————————————————————————————————#
#爬虫练习：爬虫年报并且解析（关键词词频、词云）
#————————————————————————————————————————————#
#导入工具包
import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains  # simulate mouse actions
from selenium.webdriver.common.by import By  # locator strategies for find_element(s)
from selenium.webdriver.common.keys import Keys  # simulate keyboard input
# Load the spreadsheet that stores the saved stock codes.
data = pd.read_excel('/Users/cythnia/Desktop/1.xlsx')
# Print the raw `code1` column for inspection.
lis1 = data['code1']
print(lis1)
# Split every entry on the letter 's' and keep only the piece before it.
lis1 = lis1.str.split('s').str.get(0)
#创建以公司代码为名称的文件夹
# path1='/Users/cythnia/Desktop/年报' #文件夹路径
# part_name=lis1#文件夹名列表
# for i in part_name:
#     a=os.makedirs(path1+'/{}'.format(i))
# #设置爬取的url
# Landing page of the CNINFO site; the crawl starts here.
url = 'http://www.cninfo.com.cn/new/index'

# Browser-like User-Agent passed to `requests.get` when files are downloaded.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/100.0.4896.127 Safari/537.36'
    ),
}

# Start a Chrome session (the call parentheses are required) and open the page.
driver = webdriver.Chrome()
driver.get(url)
# Not a fixed sleep: element lookups keep retrying for up to 8 seconds
# before they give up.
driver.implicitly_wait(8)
# Visit each company's page in turn and download its annual reports.
#
# Relies on names defined earlier in the script: `lis1` (stock codes),
# `driver` (Selenium session on the landing page) and `headers` (HTTP
# headers for `requests`).
for i in lis1:
    # Find the search box on the landing page and submit the stock code.
    sousuo = driver.find_element(By.CSS_SELECTOR, 'body > header > div > div.search-wrap > div > div > input')
    sousuo.clear()
    time.sleep(np.random.randint(2, 5))  # random 2-4 second pause
    sousuo.send_keys(i)
    time.sleep(2)
    sousuo.send_keys(Keys.ENTER)
    time.sleep(5)

    # Switch to the second window handle (presumably the company page that
    # the search opened in a new tab).
    driver.switch_to.window(driver.window_handles[1])
    print('已经进入到'+str(i)+'公司界面')

    # Open the announcement-category dropdown.
    fenlei = driver.find_element(By.CSS_SELECTOR, '#main > div.jc-layout > div > div.el-col.el-col-21 > div > div > div.data-detail > div:nth-child(1) > div.bg-white.el-row > form > div.el-form-item.width-85.el-form-item--medium > div > span > button')
    fenlei.click()
    time.sleep(2)

    # Tick the first checkbox on the page (expected to be "annual report").
    nianbao = driver.find_element(By.CLASS_NAME, 'el-checkbox__inner')
    nianbao.click()
    time.sleep(2)

    # Fill in the start and end of the date range, then press ENTER to search.
    shijian1 = driver.find_element(By.CSS_SELECTOR, '#main > div.jc-layout > div > div.el-col.el-col-21 > div > div > div.data-detail > div:nth-child(1) > div:nth-child(1) > form > div:nth-child(3) > div > div > input:nth-child(2)')
    shijian1.send_keys('2012-01-01')
    time.sleep(2)
    shijian2 = driver.find_element(By.CSS_SELECTOR, '#main > div.jc-layout > div > div.el-col.el-col-21 > div > div > div.data-detail > div:nth-child(1) > div:nth-child(1) > form > div:nth-child(3) > div > div > input:nth-child(4)')
    shijian2.send_keys('2022-05-01')
    time.sleep(3)
    shijian2.send_keys(Keys.ENTER)
    time.sleep(np.random.randint(3, 5))

    # Make sure the per-company download folder exists before writing to it.
    folder = '/Users/cythnia/Desktop/年报/%s' % i
    os.makedirs(folder, exist_ok=True)

    # Collect the announcement links from the result table.
    tr_list = driver.find_elements(By.CSS_SELECTOR, '#main > div.jc-layout > div > div.el-col.el-col-21 > div > div > div.data-detail > div:nth-child(1) > div:nth-child(3) > div > div > div.el-table__body-wrapper.is-scrolling-none > table > tbody > tr > td.el-table_3_column_7 > div > a')
    failed = 0
    for tr in tr_list:
        name = tr.text
        # Skip report summaries and cancelled announcements.
        if '摘要' in name or '已取消' in name:
            continue

        # Rebuild the link from the part after the first 'cn/'; maxsplit=1
        # keeps the rest intact even if 'cn/' appears again later in the URL.
        lianjie = tr.get_attribute('href').split('cn/', 1)[1]
        xzlianjie = 'http://www.cninfo.com.cn/' + lianjie
        print(xzlianjie)
        print('正在下载'+name+'年报')

        # Fetch the content; a timeout prevents hanging forever, and HTTP
        # errors are reported instead of being saved to disk as a ".pdf".
        try:
            r = requests.get(xzlianjie, headers=headers, timeout=30)
            r.raise_for_status()
        except requests.RequestException as err:
            failed += 1
            print(name+'下载失败: '+str(err))
            continue

        path = '%s/%s.pdf' % (folder, name)
        # The context manager closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print(name+'下载成功')
        time.sleep(2)

    if failed:
        print(str(i)+'有'+str(failed)+'份年报下载失败')
    else:
        print(str(i)+'所有年报下载成功')

    # Close the company tab and return to the landing page for the next code.
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    time.sleep(5)
    
# #找到搜索框进行下一家公司
# sousuo1=driver.find_element_by_css_selector('body > header > div > div.search-wrap > div > div.el-input.el-input--small.el-input--suffix > input')
# #获取年报链接
# for i in lis1[1:]:
#     sousuo1.send_keys(i)
#     time.sleep(3) 
#     sousuo1.send_keys(Keys.ENTER)
#     time.sleep(3)
#     #选择年报选项
#     fenlei=driver.find_element_by_css_selector('#main > div.jc-layout > div > div.el-col.el-col-21 > div > div > div.data-detail > div:nth-child(1) > div.bg-white.el-row > form > div.el-form-item.width-85.el-form-item--medium > div > span > button')
#     fenlei.click()
#     #选择年报选项
#     driver.find_element_by_css_selector('#el-popover-4705 > div.el-checkbox-group > label:nth-child(1) > span.el-checkbox__input > span').click()
#     time.sleep(2)
#     #选择年报时间
#     shijian1=driver.find_element_by_css_selector('#main > div.jc-layout > div > div.el-col.el-col-21 > div > div > div.data-detail > div:nth-child(1) > div:nth-child(1) > form > div:nth-child(3) > div > div > input:nth-child(2)')
#     shijian1.send_keys('2018')
#     time.sleep(2)
#     shijian2=driver.find_element_by_css_selector('#main > div.jc-layout > div > div.el-col.el-col-21 > div > div > div.data-detail > div:nth-child(1) > div:nth-child(1) > form > div:nth-child(3) > div > div > input:nth-child(4)').click()
#     shijian2.send_keys('2021')
#     time.sleep(2)
#     #点击公司内年报搜索按钮
#     sousuo1=driver.find_element_by_css_selector('#main > div.jc-layout > div > div.el-col.el-col-21 > div > div > div.data-detail > div:nth-child(1) > div:nth-child(1) > form > div:nth-child(4) > div > button > span').click()
#     time.sleep(3)
#     tr_list=driver.find_elements_by_css_selector('#main > div.jc-layout > div > div.el-col.el-col-21 > div > div > div.data-detail > div:nth-child(1) > div:nth-child(3) > div > div > div.el-table__body-wrapper.is-scrolling-none > table > tbody > tr > td.el-table_3_column_7 > div > a')
#     for tr in tr_list:
#         lianjie=tr.get_attribute('href')
#         lianjie=lianjie.split('cn/')[1]
#         name=tr.text
#         xzlianjie='http://www.cninfo.com.cn/'+lianjie
#         print(xzlianjie)
#         print('正在下载'+name+'年报')
# #获取内容
#         r=requests.get(xzlianjie,headers=headers)
# #设置下载链接
#         path='/Users/cythnia/Desktop/lunwen/%s.pdf'%name
# #写进文件夹
#         with open(path,'wb') as f:
#             f.write(r.content)
#             time.sleep(3)
#             f.close()
#             print(name+'下载成功')
#             time.sleep(3)         
            
            
    
